{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:14.906617Z",
     "iopub.status.busy": "2023-09-15T16:05:14.906325Z",
     "iopub.status.idle": "2023-09-15T16:05:15.135189Z",
     "shell.execute_reply": "2023-09-15T16:05:15.134400Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import pickle\n",
    "import polars as pl\n",
    "\n",
    "# CHANGE: set this to the directory that holds the curated data files.\n",
    "# The value is prepended to file names by plain string concatenation in this notebook,\n",
    "# so a non-empty value must end with a path separator (e.g. 'data/curated/').\n",
    "PATH_OUTPUT = '' # curated data\n",
    "\n",
    "# Display setting for string columns; the trailing ';' keeps the call's return value\n",
    "# from being echoed as cell output.\n",
    "pl.Config.set_fmt_str_lengths(100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:15.139251Z",
     "iopub.status.busy": "2023-09-15T16:05:15.138975Z",
     "iopub.status.idle": "2023-09-15T16:05:15.147024Z",
     "shell.execute_reply": "2023-09-15T16:05:15.146430Z"
    }
   },
   "outputs": [],
   "source": [
    "# Manual company-name -> NAICS mapping for firms with more than 5000 vacancies in Indeed\n",
    "# and a matching score < 0.80.\n",
    "# Values are NAICS codes kept as digit strings of varying length (2 to 6 digits).\n",
    "# The comment above each entry records the candidate match(es) the code was taken from.\n",
    "\n",
    "mapping_company_names = {\n",
    "    # royal bank has no other naics\n",
    "    'rbc': '522110',\n",
    "    # bank of montreal and bmo banque de montreal both have 522110 as naics, but one of them\n",
    "    # also has a naics 622110 (that location is now permanently closed)\n",
    "    'bmo financial': '522110',\n",
    "    # shoppers drug mart and pharmaprix both have this naics\n",
    "    'shoppers drug mart pharmaprix': '446110',\n",
    "    # chartwell kanata retirement residence\n",
    "    'chartwell retirement residences': '623',\n",
    "    # gardaworld protective services (2 NAICS: 5418, 1 NAICS: 443142);\n",
    "    # gagardaworld protective services NAICS: 5418; garda security NAICS: 5412\n",
    "    'garda world': '541',\n",
    "    # cisss centre integre de sante et de services sociaux de chaudiere appalaches: 622111\n",
    "    # cisss centre integre de sante et de services sociaux des iles: 622111\n",
    "    # cisss centre integre de sante et de services sociaux des laurentides: 622111\n",
    "    # cisss de laval: 622110\n",
    "    'cisss de la monteregie est': '62211',\n",
    "    # marriott hotels have this naics, some of them are: fairfield inn suites by marriott,\n",
    "    # residence inn by marriott, towneplace suites by marriott, marriott hotels\n",
    "    'marriott international': '721110',\n",
    "    # Unity Health Toronto is comprised of: st michaels hospital (621111 & 622110),\n",
    "    # st josephs health centre (622110), providence healthcare (622110)\n",
    "    'unity health toronto': '62211',\n",
    "    # ciusss de louest de lile de montreal - does not seem to be the same location,\n",
    "    # but there is no other match for ciusss\n",
    "    'ciusss du centre sud de lile de montreal': '622110',\n",
    "    # vigi sante ltee\n",
    "    'vigi sante': '623',\n",
    "    # health sciences centre, london health sciences centre lhsc,\n",
    "    # sunnybrook health sciences centre\n",
    "    'hamilton health sciences': '622110',\n",
    "    # olymel sec lp: 445210; olymel food: 445299\n",
    "    'olymel': '4452',\n",
    "    # winners: 448150; marshalls: 448140; homesense: 442299\n",
    "    'tjx': '44',\n",
    "    # 'mosaic north america': '611630',  # mosaic - LIKELY WRONG\n",
    "    # lowes\n",
    "    'lowes home improvement': '444130',\n",
    "    # dominos pizza (608 naics: 722513, 2 others: 722511)\n",
    "    'dominos': '722513',\n",
    "    # 'als': '3231',  # als society of ontario - LIKELY WRONG\n",
    "    # good samaritan: 6233, 621111\n",
    "    'the good samaritan society': '62',\n",
    "    # ernst young llp: 541219; ernst young: 5412\n",
    "    'ey': '5412',\n",
    "    # ciusss de louest de lile de montreal\n",
    "    'ciusss du nord de lile de montreal': '622110',\n",
    "    # h m\n",
    "    'hm': '4481',\n",
    "    # alberta precision laboratories\n",
    "    'alberta precision labs': '6215',\n",
    "    # 'goeasy': ['easyfinancial', 'easyhome', 'lendcare'],  # goeasy ltd operates with three business units\n",
    "    # cogir\n",
    "    'cogir immobilier': '531311',\n",
    "    # Cannabis product manufacturing\n",
    "    'canopy growth': '31231',\n",
    "    # the fairmont hotel vancouver\n",
    "    'fairmont hotels resorts': '721110',\n",
    "    # solutions mieux etre lifeworks auparavant morneau shepell\n",
    "    'morneau shepell': '5416',\n",
    "    # red robin gourmet burgers\n",
    "    'red robin': '722513',\n",
    "    # estee lauder: 446120; estee lauder cosmetics: 81211 - UNSURE\n",
    "    'the estee lauder companies': '446120',\n",
    "    # industrielle alliance assurance et services fina nciers: 524210\n",
    "    # industrielle alliance assurance et services financie: 523930, 524210, 531210\n",
    "    'industrielle alliance': '52',\n",
    "    # patrick morin building centre\n",
    "    'patrick morin': '444130',\n",
    "    # ciusss de louest de lile de montreal - does not seem to be the same location,\n",
    "    # but there is no other match for ciusss\n",
    "    'ciusss de la mauricie et du centre du quebec': '622110',\n",
    "    # dhl airport service center, dhl authorized ship centre, dhl express,\n",
    "    # dhl service point: 492110; dhl distribution: 493110\n",
    "    'dhl': '49',\n",
    "    # shannex health care\n",
    "    'shannex': '621111',\n",
    "    # value village thrift stores: 453310\n",
    "    'savers value village': '453310',\n",
    "    # assuming that it's same as above (presumably the 'cisss de la monteregie est' entry)\n",
    "    'cisss chaudiere appalaches': '62211',\n",
    "    # 'exp' : # EXP: Engineering, Architecture, Design and Consulting; no idea what naics it is\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:15.150145Z",
     "iopub.status.busy": "2023-09-15T16:05:15.149884Z",
     "iopub.status.idle": "2023-09-15T16:05:15.591661Z",
     "shell.execute_reply": "2023-09-15T16:05:15.590771Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lookup table read from the curated-data directory; it is joined on 'company_name' below.\n",
    "preprocess_mapper = pl.read_parquet(PATH_OUTPUT + \"preprocess_mapper.parquet\")\n",
    "\n",
    "df_mapping_company_name_to_naics = (\n",
    "    # Build the frame from explicitly named columns so polars never has to infer\n",
    "    # whether the input sequences are rows or columns.\n",
    "    pl.DataFrame(\n",
    "        {\n",
    "            'company_name': list(mapping_company_names.keys()),\n",
    "            'naics': list(mapping_company_names.values()),\n",
    "        }\n",
    "    )\n",
    "    # use cleaned_name for next stage\n",
    "    # NOTE(review): presumably preprocess_mapper supplies 'cleaned_name'; company names\n",
    "    # missing from it leave this left join with nulls - confirm that is acceptable.\n",
    "    .join(preprocess_mapper, on='company_name', how='left')\n",
    "    .drop('company_name')\n",
    "    .with_columns(\n",
    "        [\n",
    "            # Wrap each code in a one-element integer list using native expressions\n",
    "            # instead of a per-row Python lambda through Expr.apply (deprecated in\n",
    "            # newer polars releases in favour of map_elements).\n",
    "            pl.concat_list(pl.col('naics').cast(pl.Int64)).alias('naics'),\n",
    "            # Flag every row of this table as a manually curated mapping.\n",
    "            pl.lit(True).alias('manual'),\n",
    "        ]\n",
    "    )\n",
    ")\n",
    "\n",
    "df_mapping_company_name_to_naics.write_parquet(PATH_OUTPUT + \"df_mapping_cleaned_name_to_naics.parquet\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.12 ('env_indeed2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "7120590dfa35e6512fb14e5e70b67446c3e78c7a5c027e908dbb14d6a3f8a0eb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
